\chapter{第一个程序}\label{apdx:第一个程序}
\section{主核}
\begin{lstlisting}
#include <stdlib.h>
#include <stdio.h>
#include <athread.h>
#include <sys/time.h>
#include <sys/types.h>
#include <sys/stat.h>
#include <fcntl.h>

/* Slave-core entry point; main() passes it to athread_spawn(). */
extern SLAVE_FUN(func)();

/* Matrix dimensions: X rows by Y columns. */
#define X 64
#define Y 2048

/* A and B are the operands and C receives A + B. CC is cleared by init()
 * and is not referenced anywhere else in this listing. */
int A[X][Y], B[X][Y], C[X][Y],CC[X][Y];

/*
 * Put the global matrices into their starting state:
 *   A[r][c] = r + c, B[r][c] = r + c + 1, C and CC all zero.
 * Prints "Init finished" once every element has been written.
 */
void init()
{
    int row, col;

    for (row = 0; row < X; ++row)
    {
        for (col = 0; col < Y; ++col)
        {
            /* Both operands are derived from the same row + column sum. */
            const int base = row + col;

            A[row][col] = base;
            B[row][col] = base + 1;
            C[row][col] = 0;
            CC[row][col] = 0;
        }
    }
    printf("Init finished\n");
}

/* Microseconds elapsed between two gettimeofday() samples. */
static double elapsed_us(const struct timeval *from, const struct timeval *to)
{
    return 1000000 * (to->tv_sec - from->tv_sec) + to->tv_usec - from->tv_usec;
}

/*
 * Computes C = A + B twice and times each run with gettimeofday():
 * first with a plain nested loop in this function, then by spawning the
 * slave function `func` through the athread interface. After each run the
 * elapsed time and two sample elements of C are printed.
 */
int main(void)
{
    int row, col;
    struct timeval t_begin, t_end;
    double time_use;

    /* Run 1: element-wise addition in a plain nested loop. */
    init();
    printf("No boost proceed:\n");
    gettimeofday(&t_begin, NULL);
    for (row = 0; row < X; ++row)
    {
        for (col = 0; col < Y; ++col)
        {
            C[row][col] = A[row][col] + B[row][col];
        }
    }
    gettimeofday(&t_end, NULL);
    time_use = elapsed_us(&t_begin, &t_end);
    printf("Time usage is %lf us\n", time_use);
    printf("(C[32][0], C[63][999]) = (%d, %d)\n", C[32][0], C[63][999]);

    /* Run 2: reset the matrices so the printed values come from func(). */
    init();
    athread_init();
    printf("Boosted proceed:\n");
    gettimeofday(&t_begin, NULL);
    athread_spawn(func, 0);
    athread_join();
    gettimeofday(&t_end, NULL);
    time_use = elapsed_us(&t_begin, &t_end);
    printf("Time usage is %lf us\n", time_use);
    printf("(C[32][0], C[63][999]) = (%d, %d)\n", C[32][0], C[63][999]);

    athread_halt();
    return 0;
}
\end{lstlisting}

\section{从核}
\begin{lstlisting}
#include "slave.h"
/* Matrix dimensions: X rows by Y columns (must agree with the host program). */
#define X 64
#define Y 2048

/* Value returned by athread_get_id(-1); func() uses it as the row index. */
__thread_local int my_id;
/* Counters handed to athread_get / athread_put and polled by func(). */
__thread_local volatile unsigned long get_reply, put_reply;
/* Staging buffers, each sized for one row of Y ints.
 * NOTE(review): __thread_local presumably gives each slave core its own
 * copy in local memory -- confirm against the toolchain manual. */
__thread_local int A_slave[Y], B_slave[Y], C_slave[Y];
/* Matrices defined by the host program. */
extern int A[X][Y], B[X][Y], C[X][Y];

/*
 * Slave kernel: computes one row of C = A + B.
 *
 * The row index is the value returned by athread_get_id(-1). The row of A
 * and the row of B are fetched into A_slave / B_slave with athread_get,
 * added element by element into C_slave, and the result is written back to
 * the matching row of C with athread_put.
 *
 * NOTE(review): the row index is not range-checked, so this assumes the
 * kernel is started with no more than X ids -- confirm against the caller.
 */
void func()
{
    /* Bytes in one matrix row, derived from the element type rather than a
     * hard-coded 4 so it stays correct if the element type changes. */
    const int row_bytes = (int)(Y * sizeof(int));
    int i;

    my_id = athread_get_id(-1);
    get_reply = 0;
    put_reply = 0;

    /* Issue both fetches, then wait until both have reported completion. */
    athread_get(PE_MODE, &A[my_id][0], A_slave, row_bytes, &get_reply, 0, 0, 0);
    athread_get(PE_MODE, &B[my_id][0], B_slave, row_bytes, &get_reply, 0, 0, 0);
    while (get_reply != 2)
        ;

    for (i = 0; i < Y; i++)
    {
        C_slave[i] = A_slave[i] + B_slave[i];
    }

    /* Write the finished row back and wait for the transfer to complete. */
    athread_put(PE_MODE, C_slave, &C[my_id][0], row_bytes, &put_reply, 0, 0);
    while (put_reply != 1)
        ;
}
\end{lstlisting}

\section{经过SIMD优化后的从核}\label{apdx:优化后的第一个程序从核}
\begin{lstlisting}
#include "simd.h"
#include "slave.h"
/* Matrix dimensions: X rows by Y columns (must agree with the host program). */
#define X 64
#define Y 2048

/* Size in bytes of one intv8 vector (8 lanes x 32 bits). The staging
 * buffers are aligned to it because func() accesses them with
 * simd_load / simd_store. */
#define SIMD_ALIGN 32

/* Value returned by athread_get_id(-1); func() uses it as the row index. */
__thread_local int my_id;
/* Counters handed to athread_get / athread_put and polled by func(). */
__thread_local volatile unsigned long get_reply, put_reply;
/* Staging buffers, each sized for one row of Y ints. */
__thread_local int A_slave[Y] __attribute__((aligned(SIMD_ALIGN)));
__thread_local int B_slave[Y] __attribute__((aligned(SIMD_ALIGN)));
__thread_local int C_slave[Y] __attribute__((aligned(SIMD_ALIGN)));
/* Matrices defined by the host program. */
extern int A[X][Y], B[X][Y], C[X][Y];

/*
 * Slave kernel, SIMD variant: computes one row of C = A + B eight ints at
 * a time using intv8 vectors.
 *
 * The row index is the value returned by athread_get_id(-1). The row of A
 * and the row of B are fetched into A_slave / B_slave with athread_get,
 * added vector by vector into C_slave, and the result is written back to
 * the matching row of C with athread_put.
 *
 * The loop advances by 8 elements with no scalar tail, so Y must be a
 * multiple of 8.
 * NOTE(review): simd_load / simd_store are presumably the aligned access
 * forms -- confirm that A_slave, B_slave and C_slave are aligned to the
 * vector size.
 */
void func()
{
    /* Bytes in one matrix row, derived from the element type rather than a
     * hard-coded 4 so it stays correct if the element type changes. */
    const int row_bytes = (int)(Y * sizeof(int));
    int i;
    intv8 vec_sum, vec_a;

    my_id = athread_get_id(-1);
    get_reply = 0;
    put_reply = 0;

    /* Issue both fetches, then wait until both have reported completion. */
    athread_get(PE_MODE, &A[my_id][0], A_slave, row_bytes, &get_reply, 0, 0, 0);
    athread_get(PE_MODE, &B[my_id][0], B_slave, row_bytes, &get_reply, 0, 0, 0);
    while (get_reply != 2)
        ;

    for (i = 0; i < Y; i += 8)
    {
        simd_load(vec_a, &(A_slave[i]));
        simd_load(vec_sum, &(B_slave[i]));
        vec_sum = vec_sum + vec_a;
        simd_store(vec_sum, &(C_slave[i]));
    }

    /* Write the finished row back and wait for the transfer to complete. */
    athread_put(PE_MODE, C_slave, &C[my_id][0], row_bytes, &put_reply, 0, 0);
    while (put_reply != 1)
        ;
}
\end{lstlisting}

\chapter{Darknet的Makefile}\label{apdx:Darknet的Makefile}
\begin{lstlisting}[language=make]
# Feature switches; each one is tested with ifeq (..., 1) further down.
GPU=0
CUDNN=0
OPENCV=0
OPENMP=0
DEBUG=0

# Code-generation options handed to $(NVCC) by the .cu pattern rule.
ARCH  = -gencode arch=compute_30,code=sm_30
ARCH += -gencode arch=compute_35,code=sm_35
ARCH += -gencode arch=compute_50,code=[sm_50,compute_50]
ARCH += -gencode arch=compute_52,code=[sm_52,compute_52]
# Disabled entry (marked "deprecated?" by the original author):
#      -gencode arch=compute_20,code=[sm_20,sm_21]

# This is what I use, uncomment if you know your arch and want to specify
# ARCH= -gencode arch=compute_52,code=compute_52

# Source search path, output names and object directory.
VPATH=./src/:./examples
SLIB=libdarknet.so
ALIB=libdarknet.a
EXEC=darknet
OBJDIR=./obj/

# Tools and their base flags.
CC=gcc
CPP=g++
NVCC=nvcc
AR=ar
ARFLAGS=rcs
OPTS=-Ofast
LDFLAGS= -lm -pthread
COMMON= -Iinclude/ -Isrc/
CFLAGS=-Wall -Wno-unused-result -Wno-unknown-pragmas -Wfatal-errors -fPIC

ifeq ($(OPENMP), 1)
  CFLAGS+= -fopenmp
endif

# DEBUG replaces the optimisation flags before they are added to CFLAGS.
ifeq ($(DEBUG), 1)
  OPTS=-O0 -g
endif

CFLAGS+=$(OPTS)

ifeq ($(OPENCV), 1)
  COMMON+= -DOPENCV `pkg-config --cflags opencv`
  CFLAGS+= -DOPENCV
  LDFLAGS+= `pkg-config --libs opencv` -lstdc++
endif

ifeq ($(GPU), 1)
  COMMON+= -DGPU -I/usr/local/cuda/include/
  CFLAGS+= -DGPU
  LDFLAGS+= -L/usr/local/cuda/lib64 -lcuda -lcudart -lcublas -lcurand
endif

ifeq ($(CUDNN), 1)
  COMMON+= -DCUDNN
  CFLAGS+= -DCUDNN
  LDFLAGS+= -lcudnn
endif

# Objects archived into $(ALIB) and linked into $(SLIB).
OBJ = gemm.o utils.o cuda.o deconvolutional_layer.o \
      convolutional_layer.o list.o image.o activations.o \
      im2col.o col2im.o blas.o crop_layer.o \
      dropout_layer.o maxpool_layer.o softmax_layer.o data.o \
      matrix.o network.o connected_layer.o cost_layer.o \
      parser.o option_list.o detection_layer.o route_layer.o \
      upsample_layer.o box.o normalization_layer.o avgpool_layer.o \
      layer.o local_layer.o shortcut_layer.o logistic_layer.o \
      activation_layer.o rnn_layer.o gru_layer.o crnn_layer.o \
      demo.o batchnorm_layer.o region_layer.o reorg_layer.o \
      tree.o lstm_layer.o l2norm_layer.o yolo_layer.o \
      iseg_layer.o image_opencv.o

# Objects linked into $(EXEC) together with $(ALIB).
EXECOBJA = captcha.o lsd.o super.o art.o \
           tag.o cifar.o go.o rnn.o \
           segmenter.o regressor.o classifier.o coco.o \
           yolo.o detector.o nightmare.o instance-segmenter.o \
           darknet.o

# Extra objects and the C++ runtime, added only for GPU builds.
ifeq ($(GPU), 1)
  LDFLAGS+= -lstdc++
  OBJ += convolutional_kernels.o deconvolutional_kernels.o \
         activation_kernels.o im2col_kernels.o \
         col2im_kernels.o blas_kernels.o \
         crop_layer_kernels.o dropout_layer_kernels.o \
         maxpool_layer_kernels.o avgpool_layer_kernels.o
endif

# Object paths under $(OBJDIR), and the files every object depends on.
EXECOBJ = $(addprefix $(OBJDIR), $(EXECOBJA))
OBJS = $(addprefix $(OBJDIR), $(OBJ))
DEPS = $(wildcard src/*.h) Makefile include/darknet.h

# Default target: create the working directories, then build the shared
# library, the static library and the executable.
all: obj backup results $(SLIB) $(ALIB) $(EXEC)
#all: obj  results $(SLIB) $(ALIB) $(EXEC)

# Recipe lines below start with a TAB character, as GNU make requires.

# Link the executable from its own objects plus the static library.
$(EXEC): $(EXECOBJ) $(ALIB)
	$(CC) $(COMMON) $(CFLAGS) $^ -o $@ $(LDFLAGS) $(ALIB)

# Static library.
$(ALIB): $(OBJS)
	$(AR) $(ARFLAGS) $@ $^

# Shared library.
$(SLIB): $(OBJS)
	$(CC) $(CFLAGS) -shared $^ -o $@ $(LDFLAGS)

# Compile rules. "| obj" is an order-only prerequisite: the object directory
# must exist before any object is written (this matters with "make -j"),
# but its timestamp never forces a rebuild.
$(OBJDIR)%.o: %.cpp $(DEPS) | obj
	$(CPP) $(COMMON) $(CFLAGS) -c $< -o $@

$(OBJDIR)%.o: %.c $(DEPS) | obj
	$(CC) $(COMMON) $(CFLAGS) -c $< -o $@

$(OBJDIR)%.o: %.cu $(DEPS) | obj
	$(NVCC) $(ARCH) $(COMMON) --compiler-options "$(CFLAGS)" -c $< -o $@

# Working directories.
obj:
	mkdir -p obj
backup:
	mkdir -p backup
results:
	mkdir -p results

# Targets that do not name a file.
.PHONY: all clean

# $(OBJDIR) already ends in '/', so no extra slash before the wildcard.
clean:
	rm -rf $(OBJS) $(SLIB) $(ALIB) $(EXEC) $(EXECOBJ) $(OBJDIR)*
\end{lstlisting}

\chapter{移植后Darknet的Makefile文件}\label{apdx:移植后Darknet的Makefile文件}
\begin{lstlisting}[language=make]
# Set to 1 to build with -O0 -g.
DEBUG=0

# Source search path, output names and object directory.
VPATH=./src/:./examples
ALIB=libdarknet.a
EXEC=darknet
OBJDIR=./obj/

# Tools and their base flags.
CC=sw5cc
CPP=sw5CC
AR=ar
ARFLAGS=rcs
# No optimisation flags by default (-Ofast is disabled here).
OPTS=
LDFLAGS= -lm -lpthread
COMMON= -Iinclude/ -Isrc/ -fPIC -Wall
# CFLAGS is used by the compile rules, HFLAGS by the link rule.
CFLAGS= -host
HFLAGS= -hybrid

ifeq ($(DEBUG), 1)
  OPTS=-O0 -g
endif

# The common flags are folded into the compiler commands themselves, so
# every rule that invokes $(CC) or $(CPP) picks them up.
CC+=$(COMMON)
CPP+=$(COMMON)
CFLAGS+=$(OPTS)
HFLAGS+=$(OPTS)

# Objects archived into $(ALIB).
OBJ = gemm.o utils.o cuda.o deconvolutional_layer.o \
      convolutional_layer.o list.o image.o activations.o \
      im2col.o col2im.o blas.o crop_layer.o \
      dropout_layer.o maxpool_layer.o softmax_layer.o data.o \
      matrix.o network.o connected_layer.o cost_layer.o \
      parser.o option_list.o detection_layer.o route_layer.o \
      upsample_layer.o box.o normalization_layer.o avgpool_layer.o \
      layer.o local_layer.o shortcut_layer.o logistic_layer.o \
      activation_layer.o rnn_layer.o gru_layer.o crnn_layer.o \
      demo.o batchnorm_layer.o region_layer.o reorg_layer.o \
      tree.o lstm_layer.o l2norm_layer.o yolo_layer.o \
      iseg_layer.o image_opencv.o

# Objects linked into $(EXEC) together with $(ALIB).
EXECOBJA = captcha.o lsd.o super.o art.o \
           tag.o cifar.o go.o rnn.o \
           segmenter.o regressor.o classifier.o coco.o \
           yolo.o detector.o nightmare.o instance-segmenter.o \
           darknet.o

# Object paths under $(OBJDIR), and the files every object depends on.
EXECOBJ = $(addprefix $(OBJDIR), $(EXECOBJA))
OBJS = $(addprefix $(OBJDIR), $(OBJ))
DEPS = $(wildcard src/*.h) Makefile include/darknet.h

# Default target: create the working directories, then build the static
# library and the executable.
all: obj backup results $(ALIB) $(EXEC)
#all: obj  results $(ALIB) $(EXEC)

# Link the executable from its own objects plus the static library.
$(EXEC): $(EXECOBJ) $(ALIB)
	$(CC) $(HFLAGS) $^ -o $@ $(LDFLAGS) $(ALIB)

# Static library.
$(ALIB): $(OBJS)
	$(AR) $(ARFLAGS) $@ $^

# Compile rules. $(LDFLAGS) only holds -l options, which do nothing in a
# compile-only (-c) step, so it is used by the link rule alone.
# "| obj" is an order-only prerequisite: the object directory must exist
# before any object is written (this matters with "make -j"), but its
# timestamp never forces a rebuild.
$(OBJDIR)%.o: %.cpp $(DEPS) | obj
	$(CPP) $(CFLAGS) -c $< -o $@

$(OBJDIR)%.o: %.c $(DEPS) | obj
	$(CC) $(CFLAGS) -c $< -o $@

# Working directories.
obj:
	mkdir -p obj
backup:
	mkdir -p backup
results:
	mkdir -p results

# Submit the freshly built executable through bsub.
run: $(EXEC)
	bsub -I -b -q q_sw_expr -n 1 -cgsp 64 ./$(EXEC) go

# Targets that do not name a file.
.PHONY: all clean run

# $(OBJDIR) already ends in '/', so no extra slash before the wildcard.
clean:
	rm -rf $(OBJS) $(ALIB) $(EXEC) $(EXECOBJ) $(OBJDIR)*
\end{lstlisting}